Set up data for PCA

First, we want to set some variables depending on the samples we want to use in the PCA, etc. change these before running the analysis

# define input variables -------
#sample information (tab delimited file with Sample ID, population and subspecies)
pop <- read.table("/dss/dsshome1/lxc0B/ra52qed/gitlab/swallow.projects/scripts/0_sample.lists/sample.names.85.meth.sub.pop.txt", header=T)
#any other metadata to be included
info <- read_xlsx("/dss/dsshome1/lxc0B/ra52qed/gitlab/swallow.projects/scripts/0_sample.lists/NCBI_info_updated_20220727.xlsx")
## New names:
## • `` -> `...17`
## • `` -> `...42`
#number of samples
N = 85

#color palettes for plotting
color_sub <- c( "#39ad48", #E
                "#00316e", #G
                "#d30000", #R
                #"#9716a8", #RG
                #"#FFA500", #RT
                "#be6400", #S
                "#f4d000", #T
                "#622a0f", #TV
                "grey") #NA/unknown

Load in methylation data

Here, we can also specify a subset of samples if there are individuals we want to exclude

#Create a cleaned dataframe without hybrid individuals
prop.df <- prop.168.20miss.merged

prop.df <- prop.df %>%
  dplyr::select("site.id", contains("_R_") | contains("_G_") | contains("_T_") | contains("_S_") | contains("_TV_") | contains("_E_"))

prop.df <- prop.df %>%
  mutate(chr_pos = site.id) %>%
  separate(chr_pos, into = c("chr", "pos"))

prop.df <- prop.df %>%
  filter(chr == "chr11") %>% # First, filter for chromosome 11
  filter(
    (as.numeric(pos) >= 1.8e7 & as.numeric(pos) <= 1.83e7) | # Range 1
    (as.numeric(pos) >= 1.94e7 & as.numeric(pos) <= 1.965e7) | # Range 2
    (as.numeric(pos) >= 2e7) # Range 3 (from 2e7 onwards)
  )

#load in linked methylation loci
#meth_linked <- read.table("/dss/dsslegfs01/pr53da/pr53da-dss-0034/projects/2021_SwallowRRBS/3_Linkage/1_Rall/4_chr11_inv_meth.txt", header=T)

#prop.df <- prop.df %>%
#  filter(pos %in% meth_linked$bp_meth) 

#prop.df <- prop.df %>%
#  dplyr::filter(as.numeric(pos) > 1.9e7) %>%
#  dplyr::filter(as.numeric(pos) < 2e7)

prop.df <- prop.df[,1:86]

prop.df <- prop.df %>%
  mutate(row_mean = rowMeans(select(., where(is.numeric)), na.rm = TRUE)) %>%
  mutate(across(where(is.numeric), ~ ifelse(is.na(.), row_mean, .))) %>%
  select(-row_mean)

Run the PCA

Plotting PCA axes

Here, we can use a function that plots the PCA so we can plot several at once

Cumulative variance of components

Here we want to plot the amount of variance described by each PC axis. We do this by using the eigenvalues and converting this to a percentage of variance.

Plot PCA

We can make plots looking at the first 3 axes of the PCA.

#PCA Loadings

Look at which sites contribute most to which axes